library(knitr)
library(tidyverse)
library(caret)
library(DMwR)
library(rpart)
library(ROCR)
library(randomForest)
library(xgboost)
library(rpart.plot)
#----------------------------------------------------------------
#' #1. Collect and Prepare the Data
#----------------------------------------------------------------
data_orig <- read_csv("case3data.csv")

# Convert every categorical column to a factor in one pass with
# mutate(across(...)) instead of nine repetitive `data$x <- as.factor(data$x)`
# assignments, and round gpa to 3 decimal places.
factor_cols <- c(
  "grade", "dropped", "ethnicity", "sex", "zip",
  "subsidizedLunches", "sanctions", "athleticSeasons", "year"
)
data <- data_orig %>%
  mutate(
    across(all_of(factor_cols), as.factor),
    gpa = round(gpa, 3)
  )
# Explore the distributions of the numeric predictors: one faceted
# histogram per column. pivot_longer() replaces the superseded gather().
data %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot() +
  geom_histogram(mapping = aes(x = value, fill = key), color = "black") +
  facet_wrap(~ key, scales = "free") +
  theme_minimal() +
  theme(legend.position = 'none')

# Explore the level frequencies of the categorical predictors: one faceted
# bar chart per column. Factors are converted to character while reshaping
# so columns with different level sets can share one value column
# (gather() did this coercion silently).
data %>%
  select(where(is.factor)) %>%
  pivot_longer(
    everything(),
    names_to = "key",
    values_to = "value",
    values_transform = as.character
  ) %>%
  count(key, value, name = "n") %>%
  ggplot() +
  geom_col(mapping = aes(x = value, y = n, fill = key), color = "black") +
  coord_flip() +
  facet_wrap(~ key, scales = "free") +
  theme_minimal() +
  theme(legend.position = 'none')
# Split on school year: rows before 2017 train the models; 2017 is held out
# as the test set. (The original comment mentioned caret's
# createDataPartition(), but the split is actually time-based, not random.)
sample <- data %>%
  filter(year != 2017)
set.seed(1234)
# Balance the minority "dropped" class in the training data with SMOTE.
# (The dead `data.train <- sample` assignment that was immediately
# overwritten here has been removed.)
data.train <- SMOTE(dropped ~ ., data.frame(sample), perc.over = 200, perc.under = 250)
data_test <- data %>%
  filter(year == 2017)
# Drop the identifier and split columns so they are not used as predictors.
data.train <- data.train %>%
  select(-studentID, -year)
# Logistic regression: fit a full model with every remaining predictor
# on the SMOTE-balanced training set.
logit_mod <- glm(
  dropped ~ .,
  family = binomial(link = 'logit'),
  data = data.train
)
#' View the results of the model.
summary(logit_mod)
##
## Call:
## glm(formula = dropped ~ ., family = binomial(link = "logit"),
## data = data.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5480 -0.5183 -0.1543 0.5529 3.0792
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.086e+00 2.149e-01 32.969 < 2e-16 ***
## grade11thGrade 9.441e-01 1.005e-01 9.396 < 2e-16 ***
## grade12thGrade 6.342e-01 1.082e-01 5.861 4.60e-09 ***
## grade9thGrade -9.329e-01 1.052e-01 -8.871 < 2e-16 ***
## zip15206 -1.478e-01 9.323e-02 -1.585 0.112855
## zip15208 -2.357e-01 1.239e-01 -1.902 0.057178 .
## zip15224 2.110e-02 1.171e-01 0.180 0.857033
## zip15232 1.451e-01 1.406e-01 1.032 0.301992
## ethnicityAsian 3.512e-01 2.558e-01 1.373 0.169834
## ethnicityHispanic 6.025e-01 2.463e-01 2.446 0.014443 *
## ethnicityOther 5.376e-01 1.500e-01 3.585 0.000338 ***
## ethnicityWhite 9.560e-01 1.169e-01 8.178 2.90e-16 ***
## sexM 5.974e-02 6.730e-02 0.888 0.374666
## gpa -2.519e+00 5.927e-02 -42.500 < 2e-16 ***
## subsidizedLunchesNone -1.848e-01 7.966e-02 -2.320 0.020365 *
## subsidizedLunchesPartly 6.946e-02 1.066e-01 0.652 0.514618
## employmentHours -6.308e-04 7.111e-03 -0.089 0.929320
## hrsWifiPerWeek 6.734e-02 9.400e-03 7.164 7.86e-13 ***
## sanctionsnothing -4.506e-01 7.912e-02 -5.695 1.23e-08 ***
## sanctionssuspended 1.021e-01 1.409e-01 0.725 0.468593
## librarySwipesPerWeek -3.323e-01 1.714e-02 -19.380 < 2e-16 ***
## apClasses -3.658e-01 7.266e-02 -5.034 4.79e-07 ***
## athleticSeasons1 -7.491e-01 8.477e-02 -8.837 < 2e-16 ***
## athleticSeasons2 -3.656e-01 1.082e-01 -3.380 0.000724 ***
## athleticSeasons3 1.697e-01 2.091e-01 0.812 0.417007
## athleticSeasons4 2.031e-01 6.017e-01 0.338 0.735668
## athleticSeasons5 -8.536e+00 2.296e+02 -0.037 0.970347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10246.3 on 7743 degrees of freedom
## Residual deviance: 5660.3 on 7717 degrees of freedom
## AIC: 5714.3
##
## Number of Fisher Scoring iterations: 11
### Backward stepwise selection: iteratively drop predictors that do not
### improve AIC, then inspect the reduced model.
logit_mod <- step(logit_mod, direction = "backward", trace = FALSE)
summary(logit_mod)
##
## Call:
## glm(formula = dropped ~ grade + zip + ethnicity + gpa + subsidizedLunches +
## hrsWifiPerWeek + sanctions + librarySwipesPerWeek + apClasses +
## athleticSeasons, family = binomial(link = "logit"), data = data.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5574 -0.5187 -0.1551 0.5534 3.0695
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.109817 0.211755 33.576 < 2e-16 ***
## grade11thGrade 0.943651 0.100455 9.394 < 2e-16 ***
## grade12thGrade 0.634239 0.108193 5.862 4.57e-09 ***
## grade9thGrade -0.929754 0.102462 -9.074 < 2e-16 ***
## zip15206 -0.147602 0.092604 -1.594 0.110957
## zip15208 -0.235732 0.123398 -1.910 0.056091 .
## zip15224 0.018543 0.115768 0.160 0.872745
## zip15232 0.143855 0.140297 1.025 0.305191
## ethnicityAsian 0.351819 0.255456 1.377 0.168445
## ethnicityHispanic 0.603531 0.246130 2.452 0.014203 *
## ethnicityOther 0.537346 0.149916 3.584 0.000338 ***
## ethnicityWhite 0.954976 0.116815 8.175 2.96e-16 ***
## gpa -2.517968 0.059223 -42.516 < 2e-16 ***
## subsidizedLunchesNone -0.184555 0.079577 -2.319 0.020384 *
## subsidizedLunchesPartly 0.072634 0.106533 0.682 0.495370
## hrsWifiPerWeek 0.067266 0.009395 7.160 8.09e-13 ***
## sanctionsnothing -0.450525 0.079116 -5.694 1.24e-08 ***
## sanctionssuspended 0.103409 0.140836 0.734 0.462797
## librarySwipesPerWeek -0.332091 0.017130 -19.386 < 2e-16 ***
## apClasses -0.365173 0.072632 -5.028 4.96e-07 ***
## athleticSeasons1 -0.748415 0.084718 -8.834 < 2e-16 ***
## athleticSeasons2 -0.365732 0.108113 -3.383 0.000717 ***
## athleticSeasons3 0.173296 0.209279 0.828 0.407636
## athleticSeasons4 0.198680 0.601364 0.330 0.741111
## athleticSeasons5 -8.503953 229.628532 -0.037 0.970458
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10246.3 on 7743 degrees of freedom
## Residual deviance: 5661.1 on 7719 degrees of freedom
## AIC: 5711.1
##
## Number of Fisher Scoring iterations: 11
# LOGISTIC REGRESSION
# Score the held-out 2017 data with the stepwise-selected model.
data_test <- data_test %>%
  select(-year, -studentID)
logit.pred.prob <- predict(logit_mod, data_test, type = 'response')
# Decision boundary of 0.5 (i.e. if P(y=1|X) > 0.5 then y="1" else y="0").
# Pin BOTH factor levels on the prediction so posPredValue()/sensitivity()/
# confusionMatrix() below cannot fail when every prediction lands in one class.
logit.pred <- factor(ifelse(logit.pred.prob > 0.5, "1", "0"), levels = c("0", "1"))
test <- data_test$dropped
pred <- logit.pred
prob <- logit.pred.prob
# ROC curve inputs (ROCR).
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
# Get performance metrics.
accuracy <- mean(test == pred)
precision <- posPredValue(pred, test, positive = "1")
recall <- sensitivity(pred, test, positive = "1")
fmeasure <- (2 * precision * recall) / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- tibble(approach = "Logistic Regression", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
# Train a baseline CART classification tree with caret's defaults
# (bootstrap resampling, automatic cp tuning grid).
tree.mod <- train(dropped ~ ., data = data.train, method = "rpart")
# Print the resampling summary for the tuned tree.
tree.mod
## CART
##
## 7744 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 7744, 7744, 7744, 7744, 7744, 7744, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.03546832 0.8432527 0.6603159
## 0.04855372 0.8251994 0.6105566
## 0.52203857 0.7696179 0.4431317
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03546832.
# Predict classes for the held-out 2017 test set.
tree.pred <- predict(tree.mod, data_test)
# View the Confusion Matrix.
confusionMatrix(tree.pred, data_test$dropped, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2760 27
## 1 280 53
##
## Accuracy : 0.9016
## 95% CI : (0.8906, 0.9118)
## No Information Rate : 0.9744
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2246
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.66250
## Specificity : 0.90789
## Pos Pred Value : 0.15916
## Neg Pred Value : 0.99031
## Prevalence : 0.02564
## Detection Rate : 0.01699
## Detection Prevalence : 0.10673
## Balanced Accuracy : 0.78520
##
## 'Positive' Class : 1
##
# Just like before, note that we can obtain predicted classes...
head(predict(tree.mod, data_test, type = "raw"))
## [1] 0 0 0 0 0 0
## Levels: 0 1
# ...as well as the predicted probabilities (with "raw" and "prob", respectively).
head(predict(tree.mod, data_test, type = "prob"))
## 0 1
## 1 0.9351379 0.0648621
## 2 0.9351379 0.0648621
## 3 0.9351379 0.0648621
## 4 0.6503690 0.3496310
## 5 0.9351379 0.0648621
## 6 0.9351379 0.0648621
# 10-fold cross-validation, keeping the tuning value with the best Kappa.
ctrl <-
  trainControl(method = "cv",
               number = 10,
               selectionFunction = "best")
# Tune the rpart complexity parameter over a fine grid.
# (A leftover C5.0 grid -- .model/.trials/.winnow -- was removed here;
# it was created and then immediately overwritten, so it was never used.)
grid <-
  expand.grid(
    .cp = seq(from = 0.0001, to = 0.005, by = 0.0001)
  )
set.seed(1234)
tree.mod <-
  train(
    dropped ~ .,
    data = data.train,
    method = "rpart",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid
  )
# Visualize the final tuned tree.
library(rattle)
fancyRpartPlot(tree.mod$finalModel)
#----------------------------------------------------------------
#' #4. Random Forest
#----------------------------------------------------------------
# Tune the number of predictors sampled at each split (mtry).
grid <- expand.grid(.mtry = c(3, 6, 9,12))
# 3-fold cross-validation; "oneSE" picks the simplest model within one
# standard error of the best, rather than the absolute best.
ctrl <-
trainControl(method = "cv",
number = 3,
selectionFunction = "oneSE")
set.seed(1234)
rf.mod <-
train(
dropped ~ .,
data = data.train,
method = "rf",
metric = "Kappa",
trControl = ctrl,
tuneGrid = grid
)
# Print the resampling summary.
rf.mod
## Random Forest
##
## 7744 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 5163, 5163, 5162
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 3 0.9189054 0.8250410
## 6 0.9342724 0.8596484
## 9 0.9359510 0.8633017
## 12 0.9373718 0.8663367
##
## Kappa was used to select the optimal model using the one SE rule.
## The final value used for the model was mtry = 6.
#----------------------------------------------------------------
#' #5. Extreme Gradient Boosting
#----------------------------------------------------------------
# 3-fold cross-validation, keeping the tuning combination with the best Kappa.
ctrl <-
trainControl(method = "cv",
number = 3,
selectionFunction = "best")
# Tuning grid: vary tree depth, learning rate (eta) and row subsampling;
# hold boosting rounds, gamma, column sampling and min_child_weight fixed.
grid <- expand.grid(
nrounds = 20,
max_depth = c(4, 6, 8),
eta = c(0.1, 0.3, 0.5),
gamma = 0.01,
colsample_bytree = 1,
min_child_weight = 1,
subsample = c(0.5, 1)
)
set.seed(1234)
xgb.mod <-
train(
dropped ~ .,
data = data.train,
method = "xgbTree",
metric = "Kappa",
trControl = ctrl,
tuneGrid = grid
)
# Print the resampling summary.
xgb.mod
## eXtreme Gradient Boosting
##
## 7744 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 5163, 5163, 5162
## Resampling results across tuning parameters:
##
## eta max_depth subsample Accuracy Kappa
## 0.1 4 0.5 0.9106413 0.8091037
## 0.1 4 1.0 0.9084452 0.8044658
## 0.1 6 0.5 0.9221339 0.8340878
## 0.1 6 1.0 0.9252328 0.8404719
## 0.1 8 0.5 0.9253619 0.8410903
## 0.1 8 1.0 0.9265235 0.8433194
## 0.3 4 0.5 0.9275577 0.8452340
## 0.3 4 1.0 0.9300109 0.8505006
## 0.3 6 0.5 0.9341434 0.8591054
## 0.3 6 1.0 0.9402123 0.8722569
## 0.3 8 0.5 0.9342723 0.8594770
## 0.3 8 1.0 0.9409867 0.8738365
## 0.5 4 0.5 0.9331099 0.8569963
## 0.5 4 1.0 0.9377585 0.8669107
## 0.5 6 0.5 0.9334974 0.8580391
## 0.5 6 1.0 0.9420202 0.8760266
## 0.5 8 0.5 0.9338844 0.8584468
## 0.5 8 1.0 0.9426655 0.8773545
##
## Tuning parameter 'nrounds' was held constant at a value of 20
##
## Tuning parameter 'colsample_bytree' was held constant at a value of
## 1
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 20, max_depth = 8,
## eta = 0.5, gamma = 0.01, colsample_bytree = 1, min_child_weight = 1
## and subsample = 1.
#----------------------------------------------------------------
#' #6. Compare Model Performance
#----------------------------------------------------------------
#' ##Logistic Regression
# Re-fit the FULL logistic model and evaluate it on the 2017 test set.
# NOTE(review): `logit.mod` (full model) is deliberately distinct from the
# stepwise-selected `logit_mod` used earlier -- confirm this is intended.
logit.mod <-
  glm(dropped ~ ., family = binomial(link = 'logit'), data = data.train)
logit.pred.prob <- predict(logit.mod, data_test, type = 'response')
# Pin BOTH factor levels on the class prediction so the caret metric
# functions below cannot fail on a one-class prediction vector.
logit.pred <- factor(ifelse(logit.pred.prob > 0.5, "1", "0"), levels = c("0", "1"))
test <- data_test$dropped
pred <- logit.pred
prob <- logit.pred.prob
# ROC curve inputs (ROCR).
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
# Get performance metrics.
accuracy <- mean(test == pred)
precision <- posPredValue(pred, test, positive = "1")
recall <- sensitivity(pred, test, positive = "1")
fmeasure <- (2 * precision * recall) / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- tibble(approach = "Logistic Regression", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
#' ##Classification Tree.
# Class predictions and positive-class probabilities for the tuned tree.
tree.pred <- predict(tree.mod, data_test, type = "raw")
tree.pred.prob <- predict(tree.mod, data_test, type = "prob")
test <- data_test$dropped
pred <- tree.pred
prob <- tree.pred.prob[, "1"]
# ROC curve inputs (ROCR).
# dev.off()
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
# Performance metrics (`pred` is already a factor, so no coercion needed).
accuracy <- mean(test == pred)
precision <- posPredValue(pred, test, positive = "1")
recall <- sensitivity(pred, test, positive = "1")
fmeasure <- (2 * precision * recall) / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
# Append this model's row to the comparison table.
comparisons <- comparisons %>%
  add_row(approach = "Decision Tree", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
## Random Forest.
# Class predictions and positive-class probabilities for the tuned forest.
rf.pred <- predict(rf.mod, data_test, type = "raw")
rf.pred.prob <- predict(rf.mod, data_test, type = "prob")
test <- data_test$dropped
pred <- rf.pred
prob <- rf.pred.prob[, "1"]
# ROC curve inputs (ROCR).
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
# Performance metrics (`pred` is already a factor, so no coercion needed).
accuracy <- mean(test == pred)
precision <- posPredValue(pred, test, positive = "1")
recall <- sensitivity(pred, test, positive = "1")
fmeasure <- (2 * precision * recall) / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
# Append this model's row to the comparison table.
comparisons <- comparisons %>%
  add_row(approach = "Random Forest", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
# XGBoost.
# Class predictions and positive-class probabilities for the tuned booster.
xgb.pred <- predict(xgb.mod, data_test, type = "raw")
xgb.pred.prob <- predict(xgb.mod, data_test, type = "prob")
test <- data_test$dropped
pred <- xgb.pred
prob <- xgb.pred.prob[, "1"]
# ROC curve inputs (ROCR).
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
# Performance metrics (`pred` is already a factor, so no coercion needed).
accuracy <- mean(test == pred)
precision <- posPredValue(pred, test, positive = "1")
recall <- sensitivity(pred, test, positive = "1")
fmeasure <- (2 * precision * recall) / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
# Append this model's row to the comparison table.
comparisons <- comparisons %>%
  add_row(approach = "Extreme Gradient Boosting", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
#' ##Output Comparison Table.
kable(comparisons)
# The rendered table below was pasted as bare markdown, which is not valid
# R and would error on source(); it is now commented out like the other
# output transcripts in this file.
## | approach | accuracy | fmeasure | kappa | auc |
## |---|---|---|---|---|
## | Logistic Regression | 0.9006410 | 0.2757009 | 0.2441860 | 0.9189638 |
## | Decision Tree | 0.9240385 | 0.3680000 | 0.3414321 | 0.9655016 |
## | Random Forest | 0.9339744 | 0.4046243 | 0.3801882 | 0.9729811 |
## | Extreme Gradient Boosting | 0.9378205 | 0.4085366 | 0.3846779 | 0.9752673 |
# Overlay the ROC curves of all four models on one plot.
#' ##Logistic Regression (green, col = 3).
test <- data_test$dropped
prob <- logit.pred.prob
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve for Drop-out Rate Prediction Approaches", col = 3, lwd = 2)
# Base-graphics calls cannot be chained with `+` (the original
# `plot(...) + abline(...)` just printed `integer(0)`); call abline()
# separately to draw the chance-level diagonal.
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
#' ##Decision Tree (red, col = 2).
tree.pred.prob <- predict(tree.mod, data_test, type = "prob")
prob <- tree.pred.prob[, "1"]
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col = 2, lwd = 2, add = TRUE)
#' ##Random Forest (blue, col = 4).
rf.pred.prob <- predict(rf.mod, data_test, type = "prob")
prob <- rf.pred.prob[, "1"]
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col = 4, lwd = 2, add = TRUE)
#' ##XGBoost (cyan, col = 5).
xgb.pred.prob <- predict(xgb.mod, data_test, type = "prob")
prob <- xgb.pred.prob[, "1"]
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col = 5, lwd = 2, add = TRUE)
# Draw the ROC legend. The colors must be passed as `col` with a matching
# line type/width -- positionally, the 4th argument of legend() is `fill`,
# which drew colored boxes instead of keying the curve colors.
legend(0.6, 0.6,
       legend = c('Decision Tree', 'Logistic Regression', 'Random Forest', 'Extreme Gradient Boosting'),
       col = 2:5, lwd = 2, lty = 1)